ANA 680 - FINAL PROJECT by Demetri Papamichalis¶

In [1]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale,StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
In [2]:
# get working directory
from pathlib import Path
print(Path.cwd())
C:\Users\dpapa\ANA 680_material Nov '22\WK4 FINAL
In [3]:
# or another way...
import os
cwd = os.getcwd()
print(cwd)
C:\Users\dpapa\ANA 680_material Nov '22\WK4 FINAL
In [4]:
# change working dir
os.chdir("C:\\Users\\dpapa") 

cwd = os.getcwd()
print(cwd)
C:\Users\dpapa
In [5]:
# Load the dataset. Cells with content “NULL”, “?”, or “NONE” or just blank space will be interpreted as np.NAN
df=pd.read_csv(r"C:\Users\dpapa\water_quality_dataset.csv", na_values=['NULL', '?', 'NONE'])
In [6]:
df
Out[6]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [7]:
df.shape
Out[7]:
(3276, 10)
In [8]:
df.columns
Out[8]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
In [9]:
columns =list(df.columns)
columns
Out[9]:
['ph',
 'Hardness',
 'Solids',
 'Chloramines',
 'Sulfate',
 'Conductivity',
 'Organic_carbon',
 'Trihalomethanes',
 'Turbidity',
 'Potability']
In [10]:
# look for missing values
df.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [11]:
# find which column has missing values
print(df.isnull().sum().sort_values(ascending=False).to_string())
Sulfate            781
ph                 491
Trihalomethanes    162
Hardness             0
Solids               0
Chloramines          0
Conductivity         0
Organic_carbon       0
Turbidity            0
Potability           0
In [12]:
# impute missing values for "Bare Nuclei" feature with the MEAN
missing_col = ['Sulfate','ph','Trihalomethanes']
for i in missing_col:
    df.loc[df.loc[:,i].isnull(),i] = df.loc[:,i].mean()
In [13]:
# check if imputation was success by seeing if we still have missing values:
# find which column has missing values
print(df.isnull().sum().sort_values(ascending=False).to_string())
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
In [14]:
# general statistics for our columns/features
df.describe().transpose()
Out[14]:
count mean std min 25% 50% 75% max
ph 3276.0 7.080795 1.469956 0.000000 6.277673 7.080795 7.870050 14.000000
Hardness 3276.0 196.369496 32.879761 47.432000 176.850538 196.967627 216.667456 323.124000
Solids 3276.0 22014.092526 8768.570828 320.942611 15666.690297 20927.833607 27332.762127 61227.196008
Chloramines 3276.0 7.122277 1.583085 0.352000 6.127421 7.130299 8.114887 13.127000
Sulfate 3276.0 333.775777 36.142612 129.000000 317.094638 333.775777 350.385756 481.030642
Conductivity 3276.0 426.205111 80.824064 181.483754 365.734414 421.884968 481.792304 753.342620
Organic_carbon 3276.0 14.284970 3.308162 2.200000 12.065801 14.218338 16.557652 28.300000
Trihalomethanes 3276.0 66.396293 15.769881 0.738000 56.647656 66.396293 76.666609 124.000000
Turbidity 3276.0 3.966786 0.780382 1.450000 3.439711 3.955028 4.500320 6.739000
Potability 3276.0 0.390110 0.487849 0.000000 0.000000 0.000000 1.000000 1.000000
In [15]:
# histogram with specified number of bins
df.hist(bins=20, figsize=(15, 15))
Out[15]:
array([[<AxesSubplot:title={'center':'ph'}>,
        <AxesSubplot:title={'center':'Hardness'}>,
        <AxesSubplot:title={'center':'Solids'}>],
       [<AxesSubplot:title={'center':'Chloramines'}>,
        <AxesSubplot:title={'center':'Sulfate'}>,
        <AxesSubplot:title={'center':'Conductivity'}>],
       [<AxesSubplot:title={'center':'Organic_carbon'}>,
        <AxesSubplot:title={'center':'Trihalomethanes'}>,
        <AxesSubplot:title={'center':'Turbidity'}>],
       [<AxesSubplot:title={'center':'Potability'}>, <AxesSubplot:>,
        <AxesSubplot:>]], dtype=object)
In [16]:
cat_list=[]
num_list=[]

for column_name in df.columns:
    
    unique_values=len(df[column_name].unique())

        
    if unique_values<5:
        cat_list.append(column_name)
    else:
        num_list.append(column_name)
In [17]:
k=1
plt.figure(figsize=(15,15))
plt.suptitle("Distribution of numerical variables")

for i in df.loc[:,num_list]:
    plt.subplot(3,3,k)
    sns.distplot(df[i])
    plt.title(i)
    k+=1
In [18]:
non_potable=df.query("Potability ==0")
potable=df.query("Potability==1")

plt.figure(figsize=(20,20))
for ax, col in enumerate(df.columns[:9]):
    plt.subplot(3,3, ax + 1)
    plt.title(col)
    sns.kdeplot(x = non_potable[col], label = "Non Potable")
    sns.kdeplot(x = potable[col], label = "Potable")
    plt.legend()
    plt.tight_layout()
In [19]:
# distribution for "Class"
op_count = df['Potability'].value_counts()
plt.figure(figsize=(10,5))
sns.barplot(x=op_count.index,y= op_count.values)
plt.title('Potability')
plt.ylabel('occurances by potability value', fontsize=12)
plt.xlabel('potability', fontsize=12)
plt.xticks(ticks=[0, 1], labels = ["Not Potable [0]", "Potable [1]"])
plt.show()
In [20]:
import plotly.express as px
d = pd.DataFrame(df["Potability"].value_counts())
fig = px.pie(d, values = "Potability", names = ["Not Potable", "Potable"], hole = 0.35, opacity = 0.8,
            labels = {"label" :"Potability","Potability":"Number of Samples"})
fig.update_layout(title = dict(text = "Pie Chart of Potability Feature"))
fig.update_traces(textposition = "outside", textinfo = "percent+label")
fig.show()
In [21]:
# generate boxplot for individual features 
df.boxplot(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity'], figsize=(15, 10))
Out[21]:
<AxesSubplot:>
In [22]:
# Examine outliers for each variable in more detail:

#pH column
fig, axes = plt.subplots(1,2)
sns.histplot(x = df['ph'], kde = True, ax = axes[0])

#define figure size
sns.set(rc={"figure.figsize":(15, 5)}) #width=6, height=5
sns.boxplot(x = df['ph'], ax = axes[1])
Out[22]:
<AxesSubplot:xlabel='ph'>
In [23]:
# lets put this into a loop for the rest of the variables:
plt.figure(figsize=(10,10))
sns.set_style('dark')
i = 1
for col in df.columns:
    fig, axes = plt.subplots(1,2)
    sns.boxplot(x=df[col])
    sns.histplot(x = df[col], kde = True, ax = axes[0])

    i+=1
<Figure size 720x720 with 0 Axes>
In [24]:
# CLEAN OUTLIERS 

# Identify values greater than max + 1.5* Inetrquartile range and  less that min - 1.5* interquartile range
i = 1;
for col in df.columns:
    
    for x in [col]:
        q75,q25 = np.percentile(df.loc[:,x],[75,25])
        intr_qr = q75-q25

        max = q75+(1.5*intr_qr)
        min = q25-(1.5*intr_qr)

        df.loc[df[x] < min,x] = df[col].mean()   # df.dropna(axis=0)
        df.loc[df[x] > max,x] = df[col].mean()
print(df.isnull().sum().sort_values(ascending=False).to_string())
    
i+=1
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
In [25]:
#lets recheck if outliers have been removed

# lets put this into a loop:
plt.figure(figsize=(18,24))
sns.set_style('dark')
i = 1
for col in df.columns:
    fig, axes = plt.subplots(1,2)
    sns.boxplot(x=df[col])
    sns.histplot(x = df[col], kde = True, ax = axes[0])

    i+=1
<Figure size 1296x1728 with 0 Axes>
In [26]:
# general statistics for our columns/features after inputation and outlier removal:
df.describe().transpose()
Out[26]:
count mean std min 25% 50% 75% max
ph 3276.0 7.074315 1.212530 3.902476 6.376857 7.080795 7.762369 10.252816
Hardness 3276.0 196.518697 29.401877 117.791230 178.222090 196.967627 215.593162 275.886513
Solids 3276.0 21629.750926 8137.573905 320.942611 15666.690297 20927.833607 26957.576932 44652.363872
Chloramines 3276.0 7.123431 1.445700 3.181183 6.179765 7.130299 8.076082 11.086526
Sulfate 3276.0 333.813477 25.541204 267.202392 321.856816 333.775777 343.827461 400.274579
Conductivity 3276.0 425.500720 79.449495 201.619737 365.811312 421.926811 480.855683 652.537592
Organic_carbon 3276.0 14.286139 3.193340 5.362371 12.094010 14.246387 16.517104 23.234326
Trihalomethanes 3276.0 66.468165 14.609623 27.095703 57.201524 66.396293 76.336831 106.371720
Turbidity 3276.0 3.967260 0.759414 1.872573 3.444882 3.959577 4.494481 6.083772
Potability 3276.0 0.390110 0.487849 0.000000 0.000000 0.000000 1.000000 1.000000
In [27]:
import seaborn as sns
#plt.figure(figsize=(7,5))
sns.heatmap(df.corr()[['Potability']], annot=True)
plt.show()
In [28]:
# correlation matrix and heatmap for feature selection
corr=df.corr()
plt.figure(figsize=(10, 6))
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap="coolwarm")
In [29]:
print("Most positively correlated features with the target variable: Potability")
corr.sort_values(['Potability'], ascending=False, inplace=True)
corr.Potability
Most positively correlated features with the target variable: Potability
Out[29]:
Potability         1.000000
Solids             0.025442
Chloramines        0.021136
Trihalomethanes    0.006346
Turbidity          0.004444
Sulfate            0.002171
ph                -0.004099
Conductivity      -0.008891
Hardness          -0.012713
Organic_carbon    -0.027319
Name: Potability, dtype: float64
In [30]:
# Only focus on features with correlation factor above 0.5
#top_feature = corr.index[corr['Potability']>0.2]
#plt.subplots(figsize=(10, 5))
#top_corr = df[top_feature].corr()
#sns.heatmap(top_corr, annot=True)

Multivariate¶

In [31]:
# Pairplot
sns.pairplot(df, hue = 'Potability')
Out[31]:
<seaborn.axisgrid.PairGrid at 0x1510862ffd0>

Define variables and target¶

In [32]:
X=df.drop("Potability",axis=1).values
y=df["Potability"].values
In [33]:
print(X)
print(y)
[[7.08079450e+00 2.04890455e+02 2.07913190e+04 ... 1.03797831e+01
  8.69909705e+01 2.96313538e+00]
 [7.08079450e+00 1.29422921e+02 1.86300579e+04 ... 1.51800131e+01
  5.63290763e+01 4.50065627e+00]
 [8.09912419e+00 2.24236259e+02 1.99095417e+04 ... 1.68686369e+01
  6.64200925e+01 3.05593375e+00]
 ...
 [9.41951032e+00 1.75762646e+02 3.31555782e+04 ... 1.10390697e+01
  6.98454003e+01 3.29887550e+00]
 [5.12676292e+00 2.30603758e+02 1.19838694e+04 ... 1.11689462e+01
  7.74882131e+01 4.70865847e+00]
 [7.87467136e+00 1.95102299e+02 1.74041771e+04 ... 1.61403676e+01
  7.86984463e+01 2.30914906e+00]]
[0. 0. 0. ... 1. 1. 1.]
In [34]:
print(X[:,1])
[204.89045547 129.42292051 224.23625939 ... 175.7626463  230.60375751
 195.10229859]
In [35]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3)
print("X_train",X_train.shape)
print("X_test",X_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)

# plot 1 variable: 1st which is 'ph' (all rows, first column)
sns.histplot(x = X_train[:,1], kde = True)
X_train (2620, 9)
X_test (656, 9)
y_train (2620,)
y_test (656,)
Out[35]:
<AxesSubplot:ylabel='Count'>
In [36]:
print(X_train)
[[5.79115437e+00 1.90431679e+02 2.02882356e+04 ... 1.77068383e+01
  8.43881914e+01 4.10483482e+00]
 [7.08079450e+00 1.87043283e+02 2.46345524e+04 ... 1.10539246e+01
  7.41081803e+01 3.49871682e+00]
 [8.39724810e+00 1.99495811e+02 1.67722262e+04 ... 1.72360472e+01
  7.58535486e+01 2.83579428e+00]
 ...
 [7.38010483e+00 2.37922637e+02 1.68625435e+04 ... 1.32544754e+01
  9.28645571e+01 5.03045220e+00]
 [8.03105522e+00 2.22628424e+02 1.81348350e+04 ... 1.20659628e+01
  7.85497122e+01 3.33888013e+00]
 [6.20357285e+00 1.39129083e+02 6.69823910e+03 ... 1.33681645e+01
  6.82986891e+01 4.30554936e+00]]
In [37]:
# min-max normalization
x_train_max = np.max(X_train)
x_train_min = np.min(X_train)
X_train = (X_train - x_train_min)/(x_train_max-x_train_min)
X_test = (X_test - x_train_min)/(x_train_max-x_train_min)

print("X_train",X_train.shape)
print("X_test",X_test.shape)
print("y_train",y_train.shape)
print("y_test",y_test.shape)

# plot 1 variable: 1st which is 'ph' (all rows, first column)
sns.histplot(x = X_train[:,1], kde = True)
X_train (2620, 9)
X_test (656, 9)
y_train (2620,)
y_test (656,)
Out[37]:
<AxesSubplot:ylabel='Count'>
In [38]:
print(X_train)
[[8.78391522e-05 4.22675167e-03 4.54740268e-01 ... 3.54941801e-04
  1.84967481e-03 5.00385170e-05]
 [1.16747799e-04 4.15079720e-03 5.52167555e-01 ... 2.05809709e-04
  1.61923750e-03 3.64517414e-05]
 [1.46257498e-04 4.42993377e-03 3.75925203e-01 ... 3.44388521e-04
  1.65836177e-03 2.15916321e-05]
 ...
 [1.23457156e-04 5.29131169e-03 3.77949759e-01 ... 2.55137381e-04
  2.03968151e-03 7.07872090e-05]
 [1.38048897e-04 4.94847575e-03 4.06469519e-01 ... 2.28495615e-04
  1.71879913e-03 3.28688330e-05]
 [9.70839482e-05 3.07674976e-03 1.50106133e-01 ... 2.57685844e-04
  1.48901161e-03 5.45377457e-05]]

try several models and see the results¶

In [39]:
models =[("LR", LogisticRegression()),
         ("SVC", SVC()),
         ("gbm",GradientBoostingClassifier()),
         ('KNN',KNeighborsClassifier()),
         ("DTC", DecisionTreeClassifier()),
         ("BNB", BernoulliNB()),
         ('RFC',RandomForestClassifier())]
results = []
names = []
finalResults = []

for name,model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test,y_pred)*100,2)
    results.append(accuracy)
    names.append(name)
    finalResults.append((name,"%s%%"%accuracy))
In [40]:
finalResults
Out[40]:
[('LR', '60.52%'),
 ('SVC', '60.52%'),
 ('gbm', '61.59%'),
 ('KNN', '51.98%'),
 ('DTC', '55.79%'),
 ('BNB', '60.52%'),
 ('RFC', '64.18%')]

Choose Random Forrest Classifier to implement since it has highest accuracy¶

In [41]:
model = RandomForestClassifier (max_depth=200, random_state=0, n_estimators=10)
In [42]:
# train model with data
model.fit(X_train,y_train.ravel())
Out[42]:
RandomForestClassifier(max_depth=200, n_estimators=10, random_state=0)
In [43]:
# make predictions
y_pred = model.predict(X_test)
df2 = pd.DataFrame(y_pred, columns = ['y-pred'])
vertical_concat = pd.concat([df2, pd.DataFrame(y_test,columns=['label'])], axis = 1 )
vertical_concat.head(10)
Out[43]:
y-pred label
0 0.0 1.0
1 0.0 1.0
2 0.0 1.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
6 0.0 1.0
7 0.0 0.0
8 0.0 1.0
9 0.0 0.0
In [44]:
# comfusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
[[332  65]
 [209  50]]
In [45]:
# accuracy 
score = model.score(X_test, y_test)
print(score*100,'%')
58.231707317073166 %
In [46]:
print (f'Train Accuracy - : {model.score(X_train,y_train)*100:.3f} %')
print (f'Test Accuracy - : {model.score(X_test,y_test)*100:.3f} %')
Train Accuracy - : 97.710 %
Test Accuracy - : 58.232 %
In [47]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s= r2_score(y_test, y_pred)
print(f' mse = {mse}, mae = {mae}, r2s={r2s}')
 mse = 0.4176829268292683, mae = 0.4176829268292683, r2s=-0.7480913803331939

Hyperparameter tuning via GridSearchCV¶

In [48]:
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the param grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)
{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}
In [49]:
rf_Model = RandomForestClassifier()
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 10, verbose=2, n_jobs = 4)
rf_Grid.fit(X_train, y_train)
Fitting 10 folds for each of 320 candidates, totalling 3200 fits
Out[49]:
GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
                                          80]},
             verbose=2)
In [50]:
rf_Grid.best_params_
Out[50]:
{'bootstrap': False,
 'max_depth': 4,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 17}
In [51]:
# put these new parameter values into our model again and re-run:
model = RandomForestClassifier (bootstrap =True,
 max_depth = 2,
 max_features = 'auto',
 min_samples_leaf = 1,
 min_samples_split = 2,
 n_estimators =10)

# train model with data
model.fit(X_train,y_train.ravel())
Out[51]:
RandomForestClassifier(max_depth=2, n_estimators=10)
In [52]:
# make predictions
y_pred = model.predict(X_test)
df2 = pd.DataFrame(y_pred, columns = ['y-pred'])
vertical_concat = pd.concat([df2, pd.DataFrame(y_test,columns=['label'])], axis = 1 )
vertical_concat.head(10)
Out[52]:
y-pred label
0 0.0 1.0
1 0.0 1.0
2 0.0 1.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
6 0.0 1.0
7 0.0 0.0
8 0.0 1.0
9 0.0 0.0
In [53]:
# comfusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
[[394   3]
 [252   7]]
In [54]:
# accuracy 
score = model.score(X_test, y_test)
print(score*100,'%')
61.12804878048781 %
In [55]:
print (f'Train Accuracy - : {model.score(X_train,y_train)*100:.3f} %')
print (f'Test Accuracy - : {model.score(X_test,y_test)*100:.3f} %')
Train Accuracy - : 61.489 %
Test Accuracy - : 61.128 %
In [56]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s= r2_score(y_test, y_pred)
print(f' mse = {mse}, mae = {mae}, r2s={r2s}')
 mse = 0.38871951219512196, mae = 0.38871951219512196, r2s=-0.6268733649086293

Hyperparameter tuning via RandomizedSearchCV¶

In [57]:
from sklearn.model_selection import RandomizedSearchCV
rf_RandomGrid = RandomizedSearchCV(estimator = rf_Model, param_distributions = param_grid, cv = 10, verbose=2, n_jobs = 4)
In [58]:
rf_RandomGrid.fit(X_train, y_train)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Out[58]:
RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=4,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 4],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [10, 17, 25, 33, 41, 48,
                                                         56, 64, 72, 80]},
                   verbose=2)
In [59]:
rf_RandomGrid.best_params_
Out[59]:
{'n_estimators': 56,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 4,
 'bootstrap': True}
In [60]:
# put these new parameter values into our model again and re-run:
model = RandomForestClassifier (n_estimators = 10,
 min_samples_split =2,
 min_samples_leaf = 2,
 max_features = 'sqrt',
 max_depth = 4,
 bootstrap = True)

# train model with data
model.fit(X_train,y_train.ravel())
Out[60]:
RandomForestClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=2,
                       n_estimators=10)
In [61]:
# make predictions
y_pred = model.predict(X_test)
df2 = pd.DataFrame(y_pred, columns = ['y-pred'])
vertical_concat = pd.concat([df2, pd.DataFrame(y_test,columns=['label'])], axis = 1 )
vertical_concat.head(10)
Out[61]:
y-pred label
0 0.0 1.0
1 0.0 1.0
2 0.0 1.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
6 0.0 1.0
7 0.0 0.0
8 0.0 1.0
9 0.0 0.0
In [62]:
# comfusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)
[[389   8]
 [243  16]]
In [63]:
# accuracy 
score = model.score(X_test, y_test)
print(score*100,'%')
61.737804878048784 %
In [64]:
print (f'Train Accuracy - : {model.score(X_train,y_train)*100:.3f} %')
print (f'Test Accuracy - : {model.score(X_test,y_test)*100:.3f} %')
Train Accuracy - : 64.275 %
Test Accuracy - : 61.738 %
In [65]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s= r2_score(y_test, y_pred)
print(f' mse = {mse}, mae = {mae}, r2s={r2s}')
 mse = 0.3826219512195122, mae = 0.3826219512195122, r2s=-0.6013537827139841
In [66]:
#import pickle
# #Open a file and use dump() 
#with open('water_quality.pkl', 'wb') as file: 
      
    ## A new file will be created 
    #pickle.dump(model, file)